# Notebook setup: inline plotly rendering plus the core data stack.
import plotly
plotly.offline.init_notebook_mode()
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np

# Load the iris bunch and wrap the raw measurements in a labeled DataFrame
# (one row per flower, one column per measured feature).
data_iris = load_iris()
df_iris = pd.DataFrame(data=data_iris.data, columns=data_iris.feature_names)
df_iris
| | sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) |
|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 |
| ... | ... | ... | ... | ... |
| 145 | 6.7 | 3.0 | 5.2 | 2.3 |
| 146 | 6.3 | 2.5 | 5.0 | 1.9 |
| 147 | 6.5 | 3.0 | 5.2 | 2.0 |
| 148 | 6.2 | 3.4 | 5.4 | 2.3 |
| 149 | 5.9 | 3.0 | 5.1 | 1.8 |
150 rows × 4 columns
# Reduce the 3-class problem to binary: y is True iff the sample is virginica.
species_per_sample = data_iris.target_names[data_iris.target]
df_iris['y'] = species_per_sample == 'virginica'
df_iris.head()
| | sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) | y |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | False |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | False |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | False |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | False |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | False |
# Split the frame by class for side-by-side summary statistics.
# Use the boolean column directly as a mask (and its negation) rather than
# the non-idiomatic `== True` / `== False` comparisons.
virginica = df_iris[df_iris['y']]
not_virginica = df_iris[~df_iris['y']]
virginica.describe()
| | sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) |
|---|---|---|---|---|
| count | 50.00000 | 50.000000 | 50.000000 | 50.00000 |
| mean | 6.58800 | 2.974000 | 5.552000 | 2.02600 |
| std | 0.63588 | 0.322497 | 0.551895 | 0.27465 |
| min | 4.90000 | 2.200000 | 4.500000 | 1.40000 |
| 25% | 6.22500 | 2.800000 | 5.100000 | 1.80000 |
| 50% | 6.50000 | 3.000000 | 5.550000 | 2.00000 |
| 75% | 6.90000 | 3.175000 | 5.875000 | 2.30000 |
| max | 7.90000 | 3.800000 | 6.900000 | 2.50000 |
not_virginica.describe()
| | sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) |
|---|---|---|---|---|
| count | 100.000000 | 100.000000 | 100.000000 | 100.000000 |
| mean | 5.471000 | 3.099000 | 2.861000 | 0.786000 |
| std | 0.641698 | 0.478739 | 1.449549 | 0.565153 |
| min | 4.300000 | 2.000000 | 1.000000 | 0.100000 |
| 25% | 5.000000 | 2.800000 | 1.500000 | 0.200000 |
| 50% | 5.400000 | 3.050000 | 2.450000 | 0.800000 |
| 75% | 5.900000 | 3.400000 | 4.325000 | 1.300000 |
| max | 7.000000 | 4.400000 | 5.100000 | 1.800000 |
import seaborn as sns
import matplotlib.pyplot as plt

# One histogram per measurement column, split by class, to eyeball which
# features separate virginica from the rest.
feature_cols = df_iris.columns[0:4]
for feature in feature_cols:
    sns.histplot(data=df_iris, hue='y', x=feature, bins=10, palette='viridis')
    # NOTE(review): relabeling the legend with a plain list assumes a fixed
    # hue/artist order — verify the labels line up with the colors.
    plt.legend(['Not Virginica', 'Virginica'])
    plt.show()
d:\AI & ML\ML FOUNDATION\LABS\CSCN8010\venv\CSCN8010_classic_ml\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. d:\AI & ML\ML FOUNDATION\LABS\CSCN8010\venv\CSCN8010_classic_ml\Lib\site-packages\seaborn\_oldcore.py:1075: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning. d:\AI & ML\ML FOUNDATION\LABS\CSCN8010\venv\CSCN8010_classic_ml\Lib\site-packages\seaborn\_oldcore.py:1075: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.
d:\AI & ML\ML FOUNDATION\LABS\CSCN8010\venv\CSCN8010_classic_ml\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. d:\AI & ML\ML FOUNDATION\LABS\CSCN8010\venv\CSCN8010_classic_ml\Lib\site-packages\seaborn\_oldcore.py:1075: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning. d:\AI & ML\ML FOUNDATION\LABS\CSCN8010\venv\CSCN8010_classic_ml\Lib\site-packages\seaborn\_oldcore.py:1075: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.
d:\AI & ML\ML FOUNDATION\LABS\CSCN8010\venv\CSCN8010_classic_ml\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. d:\AI & ML\ML FOUNDATION\LABS\CSCN8010\venv\CSCN8010_classic_ml\Lib\site-packages\seaborn\_oldcore.py:1075: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning. d:\AI & ML\ML FOUNDATION\LABS\CSCN8010\venv\CSCN8010_classic_ml\Lib\site-packages\seaborn\_oldcore.py:1075: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.
d:\AI & ML\ML FOUNDATION\LABS\CSCN8010\venv\CSCN8010_classic_ml\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. d:\AI & ML\ML FOUNDATION\LABS\CSCN8010\venv\CSCN8010_classic_ml\Lib\site-packages\seaborn\_oldcore.py:1075: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning. d:\AI & ML\ML FOUNDATION\LABS\CSCN8010\venv\CSCN8010_classic_ml\Lib\site-packages\seaborn\_oldcore.py:1075: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.
# Pairwise Pearson correlation of the four measurement columns
# (the boolean target column is excluded).
measurement_cols = df_iris.columns[0:4]
df_iris_corr = df_iris.loc[:, measurement_cols].corr()
print(df_iris_corr)

# Render the same matrix as an annotated heatmap.
plt.figure(figsize=(12, 10))
sns.heatmap(df_iris_corr, annot=True)
plt.title('Heatmap of Correlation Matrix of Iris Dataset')
plt.show()
sepal length (cm) sepal width (cm) petal length (cm) \
sepal length (cm) 1.000000 -0.117570 0.871754
sepal width (cm) -0.117570 1.000000 -0.428440
petal length (cm) 0.871754 -0.428440 1.000000
petal width (cm) 0.817941 -0.366126 0.962865
petal width (cm)
sepal length (cm) 0.817941
sepal width (cm) -0.366126
petal length (cm) 0.962865
petal width (cm) 1.000000
import plotly.express as px

# Palette shared by the plotly figures in this notebook.
custom_colors = ['#1f77b4', 'cornflowerblue']

# Class counts looked up explicitly by key so the labels cannot drift from
# the values. The original passed `value_counts()` directly alongside a
# hard-coded names list, which silently relied on value_counts ordering
# (most-frequent first) matching the label order.
class_counts = df_iris['y'].value_counts()
fig_pie = px.pie(
    values=[class_counts[False], class_counts[True]],
    names=['Not Virginica', 'Virginica'],
    title='Pie Chart of Species Counts',
    color_discrete_sequence=custom_colors,
)
# Show the pie chart
fig_pie.show()
Observation: the binary task is imbalanced — 100 non-virginica samples versus 50 virginica samples (a 2:1 ratio).
# Box plot of sepal length per class, reusing the notebook-wide palette.
fig_boxplot = px.box(
    df_iris,
    x="y",
    y="sepal length (cm)",
    title="Box Plot of Sepal Length by Species",
    color='y',                               # one colored box per class
    color_discrete_sequence=custom_colors,   # custom color sequence
)
# Hide the raw True/False tick labels; the legend carries the class names.
fig_boxplot.update_layout(
    xaxis_title="Species",
    yaxis_title="Sepal Length",
    xaxis=dict(showticklabels=False),
)
# Rename the traces from their boolean-string names to readable labels.
# "Not Virginica" (was "Non Virginica") matches the wording used by the
# histogram and pie-chart cells.
fig_boxplot.update_traces(name="Virginica", selector=dict(name="True"))
fig_boxplot.update_traces(name="Not Virginica", selector=dict(name="False"))
# Show the box plot
fig_boxplot.show()
d:\AI & ML\ML FOUNDATION\LABS\CSCN8010\venv\CSCN8010_classic_ml\Lib\site-packages\plotly\express\_core.py:1985: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.
from matplotlib_venn import venn2
# Positional column views: 0=sepal length, 1=sepal width, 2=petal length, 3=petal width.
sepal_length = df_iris.iloc[:,0]
sepal_width = df_iris.iloc[:,1]
petal_length = df_iris.iloc[:,2]  # NOTE(review): not used below — confirm whether intended
petal_width = df_iris.iloc[:,3]  # NOTE(review): not used below — confirm whether intended
# Venn diagram of the two sepal columns. The subset sizes are hard-coded
# (each circle = 150-15 exclusive, overlap = 15), not derived from the data —
# presumably an illustrative figure for the lab; TODO confirm the 15 is intentional.
venn2(subsets = (len(sepal_length)-15,len(sepal_width)-15,15),set_labels = ("sepal_length","sepal_width"))
plt.show()
from sklearn.model_selection import train_test_split

# 80/10/10 split: hold out 20% of the data, then halve the holdout into
# validation and test sets. Seeded for reproducibility.
feature_frame = df_iris[df_iris.columns[0:4]]
X_train, X_holdout, y_train, y_holdout = train_test_split(
    feature_frame, df_iris['y'], test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42)
# Draw random 1-, 2-, and 3-feature subsets (seeded so the draw is
# reproducible); feature_4 is simply all four measurement columns.
np.random.seed(42)
candidate_cols = df_iris.columns[0:4]
feature_1 = np.random.choice(candidate_cols, 1)
feature_2 = np.random.choice(candidate_cols, 2, replace=False)
feature_3 = np.random.choice(candidate_cols, 3, replace=False)
feature_4 = np.array(candidate_cols)

# List the selected subsets.
for subset_no, subset in enumerate((feature_1, feature_2, feature_3, feature_4), start=1):
    print(f'Feature {subset_no}:{subset}')
Feature 1:['petal length (cm)'] Feature 2:['sepal width (cm)' 'petal length (cm)'] Feature 3:['sepal width (cm)' 'petal width (cm)' 'sepal length (cm)'] Feature 4:['sepal length (cm)' 'sepal width (cm)' 'petal length (cm)' 'petal width (cm)']
from sklearn.linear_model import LogisticRegression

# Fit one logistic-regression classifier per feature subset and collect
# them in a dict keyed model_1 .. model_4.
models = {}
feature_subsets = (feature_1, feature_2, feature_3, feature_4)
for model_no, subset in enumerate(feature_subsets, start=1):
    clf = LogisticRegression(random_state=42)
    clf.fit(X_train[subset], y_train)
    models[f'model_{model_no}'] = clf
models
{'model_1': LogisticRegression(random_state=42),
'model_2': LogisticRegression(random_state=42),
'model_3': LogisticRegression(random_state=42),
'model_4': LogisticRegression(random_state=42)}
from sklearn.metrics import accuracy_score

# Evaluate each model on the validation set using its own feature subset.
# Fixes over the original: the misspelled `featuures_list` is renamed, the
# index-parallel list lookup is replaced with zip (models dict preserves
# insertion order, so pairing is unchanged), and the redundant
# `columns=result.keys()` argument to DataFrame is dropped.
features_list = [feature_1, feature_2, feature_3, feature_4]
for i, ((model_name, model), features) in enumerate(zip(models.items(), features_list)):
    # Hard predictions plus P(virginica) for every validation instance.
    y_pred = model.predict(X_val[features])
    y_prob = model.predict_proba(X_val[features])[:, 1]
    # Per-instance table: instance number, probability of predicting
    # virginica, the model's prediction, and the ground truth.
    result = {
        'Instance No': X_val.index,
        'Probability': y_prob,
        'Prediction': y_pred,
        'Ground_truth': y_val,
    }
    result_df = pd.DataFrame(result)
    print(f'\nTable for Model {i+1}:\n{result_df}')
    # Summarize each table to a single measure: validation accuracy.
    accuracy = accuracy_score(y_val, y_pred)
    print(f'\nAccuracy for Model {i+1}: {accuracy}')
Table for Model 1:
Instance No Probability Prediction Ground_truth
26 26 0.000022 False False
18 18 0.000031 False False
118 118 0.998699 True True
145 145 0.745829 True True
78 78 0.228705 False False
127 127 0.523520 True True
108 108 0.954400 True True
55 55 0.228705 False False
30 30 0.000022 False False
29 29 0.000022 False False
141 141 0.678970 True True
110 110 0.678970 True True
19 19 0.000016 False False
132 132 0.915775 True True
64 64 0.015327 False False
Accuracy for Model 1: 1.0
Table for Model 2:
Instance No Probability Prediction Ground_truth
26 26 0.000014 False False
18 18 0.000015 False False
118 118 0.999003 True True
145 145 0.736130 True True
78 78 0.222408 False False
127 127 0.506573 True True
108 108 0.964419 True True
55 55 0.232045 False False
30 30 0.000016 False False
29 29 0.000015 False False
141 141 0.654265 True True
110 110 0.641746 True True
19 19 0.000008 False False
132 132 0.921911 True True
64 64 0.014055 False False
Accuracy for Model 2: 1.0
Table for Model 3:
Instance No Probability Prediction Ground_truth
26 26 0.002719 False False
18 18 0.002539 False False
118 118 0.975471 True True
145 145 0.933097 True True
78 78 0.315289 False False
127 127 0.582945 True True
108 108 0.755953 True True
55 55 0.157248 False False
30 30 0.001358 False False
29 29 0.001177 False False
141 141 0.938725 True True
110 110 0.777825 True True
19 19 0.001582 False False
132 132 0.896727 True True
64 64 0.139172 False False
Accuracy for Model 3: 1.0
Table for Model 4:
Instance No Probability Prediction Ground_truth
26 26 0.000009 False False
18 18 0.000006 False False
118 118 0.998534 True True
145 145 0.873922 True True
78 78 0.207005 False False
127 127 0.572730 True True
108 108 0.946564 True True
55 55 0.170670 False False
30 30 0.000008 False False
29 29 0.000008 False False
141 141 0.820222 True True
110 110 0.728198 True True
19 19 0.000004 False False
132 132 0.956114 True True
64 64 0.016196 False False
Accuracy for Model 4: 1.0
# 1-D decision boundary of model_1 in closed form:
# sigmoid(w*x + b) = 0.5  <=>  w*x + b = 0  <=>  x = -b / w.
# Extract the scalar from the (1,1) coefficient arrays — the original passed
# the raw array to print/axvline/xlabel, so the value printed as [[...]]
# and the axis label rendered as an array repr.
model_1 = models['model_1']
decision_boundary_f1 = (-model_1.intercept_ / model_1.coef_).item()
print(f'Decision Boundary for Model 1: {decision_boundary_f1}')

# Scatter the validation points (y plotted as 0/1) with the boundary
# as a dashed vertical line.
plt.figure(figsize=(10, 6))
plt.scatter(X_val[feature_1], y_val, c=y_val, cmap='viridis')
plt.axvline(decision_boundary_f1, color='black', linestyle='--', label='Decision Boundary')
plt.title('Decision Boundary for Feature 1')
plt.xlabel(feature_1[0])  # feature_1 is a 1-element array; label with the column name
plt.ylabel('Species')
plt.legend()
plt.show()
Decision Boundary for Model 1: [[4.87124795]]
# Common axis range spanning both selected features.
feat_lo = min(X_val[feature_2[0]].min(), X_val[feature_2[1]].min())
feat_hi = max(X_val[feature_2[0]].max(), X_val[feature_2[1]].max())

# Scatter the validation points in the 2-D feature plane, colored by class.
plt.scatter(X_val[feature_2[0]], X_val[feature_2[1]], c=y_val, cmap='viridis', edgecolors='k')

# Boundary line: w1*x1 + w2*x2 + b = 0  <=>  x2 = -b/w2 - (w1/w2)*x1.
model_2 = models['model_2']
w1 = model_2.coef_[0][0]
w2 = model_2.coef_[0][1]
boundary_x1 = np.linspace(feat_lo, feat_hi, 10)
boundary_x2 = -model_2.intercept_ / w2 - (w1 / w2) * boundary_x1
plt.plot(boundary_x1, boundary_x2)
plt.title('Decision Boundary for Feature 2')
plt.xlabel(feature_2[0])
plt.ylabel(feature_2[1])
plt.show()
from mpl_toolkits.mplot3d import Axes3D

# Common axis range spanning all three selected features.
feat_lo = min(X_val[col].min() for col in feature_3)
feat_hi = max(X_val[col].max() for col in feature_3)

# Grid over the (x1, x2) plane; the boundary is a plane solved for x3:
# w1*x1 + w2*x2 + w3*x3 + b = 0  <=>  x3 = -b/w3 - (w1/w3)*x1 - (w2/w3)*x2.
x1, x2 = np.meshgrid(np.linspace(feat_lo, feat_hi, 10), np.linspace(feat_lo, feat_hi, 10))
model_3 = models['model_3']
w1 = model_3.coef_[0][0]
w2 = model_3.coef_[0][1]
w3 = model_3.coef_[0][2]
boundary_x3 = -(model_3.intercept_ / w3) - (w1 / w3) * x1 - (w2 / w3) * x2

# 3-D scatter of the validation points with the boundary plane overlaid.
fig = plt.figure(figsize=(20, 15))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X_val[feature_3[0]], X_val[feature_3[1]], X_val[feature_3[2]], color='r')
ax.plot_surface(x1, x2, boundary_x3)
plt.title('Decision Boundary for Feature 3')
ax.set_xlabel(feature_3[0])
ax.set_ylabel(feature_3[1])
ax.set_zlabel(feature_3[2])
plt.show()
# Final, unbiased estimate: evaluate model_2 on the untouched test split.
test_preds = models['model_2'].predict(X_test[feature_2])
# Fraction of correct predictions = accuracy.
test_accuracy = (test_preds == y_test).mean()
# Output the test accuracy
print(f'Test Accuracy: {test_accuracy}')
Test Accuracy: 1.0